{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## HR analysis\n",
    "### 标注是left，离散值，所以属于分类问题\n",
    "基于对HR.csv的特征探索分析基础，我们这里直接建模"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import seaborn as sns\n",
    "import scipy.stats as ss\n",
    "import matplotlib.pyplot as plt\n",
    "from sklearn.preprocessing import MinMaxScaler, StandardScaler\n",
    "from sklearn.preprocessing import LabelEncoder, OneHotEncoder\n",
    "from sklearn.decomposition import PCA\n",
    "from sklearn.discriminant_analysis import LinearDiscriminantAnalysis\n",
    "# 验证数据的准确率，召回率，F值\n",
    "from sklearn.metrics import accuracy_score, recall_score, f1_score\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 特征处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def hr_preprocessing(sl=False, le=False, npr=False, amh=False, tsc=False, wa=False, \n",
    "                     pl=False, dep=False, sal=False,lower_d=False, ld_n=1):\n",
    "    def map_salary(s):\n",
    "        d = dict([('low', 0), ('medium', 1), ('high', 2)])\n",
    "        return d.get(s, 0)\n",
    "    \n",
    "    df = pd.read_csv(\"HR.csv\")\n",
    "    \n",
    "    # 清洗数据\n",
    "    df = df.dropna(subset=['satisfaction_level', 'last_evaluation'])\n",
    "    df = df[df['satisfaction_level']<=1][df['salary']!='nme']\n",
    "    # 标注\n",
    "    label = df['left']\n",
    "    df=df.drop('left', axis=1)\n",
    "    # 特征选择\n",
    "    # 特征处理\n",
    "    scaler_lst = [sl, le, npr, amh, tsc, wa, pl]\n",
    "    column_lst = ['satisfaction_level', 'last_evaluation', 'number_project', 'average_monthly_hours', \n",
    "                  'time_spend_company', 'Work_accident', 'promotion_last_5years']\n",
    "    for i in range(len(scaler_lst)):\n",
    "        if not scaler_lst[i]:\n",
    "            df[column_lst[i]] = MinMaxScaler().fit_transform(df[column_lst[i]].values.reshape(-1, 1)).reshape(1, -1)[0]\n",
    "        else:\n",
    "            df[column_lst[i]] = StandardScaler().fit_transform(df[column_lst[i]].values.reshape(-1, 1)).reshape(1, -1)[0]\n",
    "    # 对离散值处理\n",
    "    scaler_lst2 = [sal, dep]\n",
    "    column_lst2 = ['salary', 'department']\n",
    "    for i in range(len(scaler_lst2)):\n",
    "        if not scaler_lst2[i]:\n",
    "            if column_lst2[i] == 'salary':\n",
    "                df[column_lst2[i]] = [map_salary(s) for s in df['salary'].values]\n",
    "            else:\n",
    "                df[column_lst2[i]] = LabelEncoder().fit_transform(df[column_lst2[i]].values.reshape(-1, 1))\n",
    "            # 统一归一化处理\n",
    "            df[column_lst2[i]] = MinMaxScaler().fit_transform(df[column_lst2[i]].values.reshape(-1, 1))\n",
    "        else:\n",
    "            df = pd.get_dummies(df, columns=[column_lst2[i]])\n",
    "    if lower_d:\n",
    "        # 如果为True，使用PCA降维\n",
    "        return PCA(n_components=ld_n).fit_transform(df.values), label\n",
    "    return df, label"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 建模"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "8999 3000 3000\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/f/anaconda3/lib/python3.7/site-packages/sklearn/preprocessing/label.py:235: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n",
      "  y = column_or_1d(y, warn=True)\n"
     ]
    }
   ],
   "source": [
    "def hr_modeling(features, label):\n",
    "    from sklearn.model_selection import train_test_split\n",
    "    f_v = features.values\n",
    "    l_v = label.values\n",
    "    # 训练集，验证集，Y为标注\n",
    "    X_tt, X_validation, Y_tt, Y_validation = train_test_split(f_v, l_v, test_size=0.2)\n",
    "    # 从训练集再切割0.25为测试集\n",
    "    X_train, X_test, Y_train, Y_test = train_test_split(X_tt, Y_tt, test_size=0.25)\n",
    "    print(len(X_train), len(X_validation), len(X_test))\n",
    "features, label = hr_preprocessing()\n",
    "hr_modeling(features, label)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "训练集，验证集，测试集长度比为 3：1：1，符合预期"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ACC: 0.9516666666666667\n",
      "REC: 0.9127423822714681\n",
      "F-Score: 0.9008885850991114\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/f/anaconda3/lib/python3.7/site-packages/sklearn/preprocessing/label.py:235: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n",
      "  y = column_or_1d(y, warn=True)\n"
     ]
    }
   ],
   "source": [
    "def hr_modeling(features, label):\n",
    "    from sklearn.model_selection import train_test_split\n",
    "    f_v = features.values\n",
    "    l_v = label.values\n",
    "    # 训练集，验证集，Y为标注\n",
    "    X_tt, X_validation, Y_tt, Y_validation = train_test_split(f_v, l_v, test_size=0.2)\n",
    "    # 从训练集再切割0.25为测试集\n",
    "    X_train, X_test, Y_train, Y_test = train_test_split(X_tt, Y_tt, test_size=0.25)\n",
    "    from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier\n",
    "    # n_neighbors=3 结果最好\n",
    "    knn_clf = KNeighborsClassifier(n_neighbors=3)\n",
    "    knn_clf.fit(X_train, Y_train)\n",
    "    Y_predict = knn_clf.predict(X_validation)\n",
    "    # 比较实际验证集和预测值\n",
    "    print(\"ACC:\", accuracy_score(Y_validation, Y_predict))\n",
    "    print(\"REC:\", recall_score(Y_validation, Y_predict))\n",
    "    print(\"F-Score:\", f1_score(Y_validation, Y_predict))\n",
    "features, label = hr_preprocessing()\n",
    "hr_modeling(features, label)    "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 接下来多尝试几个分类器"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/f/anaconda3/lib/python3.7/site-packages/sklearn/preprocessing/label.py:235: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n",
      "  y = column_or_1d(y, warn=True)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n",
      "KNN  ACC: 0.9738859873319258\n",
      "KNN  REC: 0.9598893499308437\n",
      "KNN  F: 0.9465787679017958\n",
      "1\n",
      "KNN  ACC: 0.954\n",
      "KNN  REC: 0.9337175792507204\n",
      "KNN  F: 0.9037656903765692\n",
      "2\n",
      "KNN  ACC: 0.9563333333333334\n",
      "KNN  REC: 0.9519774011299436\n",
      "KNN  F: 0.9114266396213658\n"
     ]
    }
   ],
   "source": [
    "def hr_modeling(features, label):\n",
    "    from sklearn.model_selection import train_test_split\n",
    "    f_v = features.values\n",
    "    l_v = label.values\n",
    "    # 训练集，验证集，Y为标注\n",
    "    X_tt, X_validation, Y_tt, Y_validation = train_test_split(f_v, l_v, test_size=0.2)\n",
    "    # 从训练集再切割0.25为测试集\n",
    "    X_train, X_test, Y_train, Y_test = train_test_split(X_tt, Y_tt, test_size=0.25)\n",
    "    from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier\n",
    "    models = []\n",
    "    models.append(('KNN', KNeighborsClassifier(n_neighbors=3)))\n",
    "    for clf_name, clf in models:\n",
    "        clf.fit(X_train, Y_train)\n",
    "        xy_lst = [(X_train, Y_train), (X_validation, Y_validation), (X_test, Y_test)]\n",
    "        for i in range(len(xy_lst)):\n",
    "            X_part = xy_lst[i][0]\n",
    "            Y_part = xy_lst[i][1]\n",
    "            Y_predict = clf.predict(X_part)\n",
    "            print(i)\n",
    "            print(clf_name, ' ACC:',accuracy_score(Y_part, Y_predict))\n",
    "            print(clf_name, ' REC:', recall_score(Y_part, Y_predict))\n",
    "            print(clf_name, ' F:',f1_score(Y_part, Y_predict))\n",
    "\n",
    "features, label = hr_preprocessing()\n",
    "hr_modeling(features, label) "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "0为训练集，1为验证集，2为测试集分别对应的准确率，召回率，F值。接下来我们把所有尝试的分类器写到一个函数里"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/f/anaconda3/lib/python3.7/site-packages/sklearn/preprocessing/label.py:235: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().\n",
      "  y = column_or_1d(y, warn=True)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n",
      "KNN  ACC: 0.9747749749972219\n",
      "KNN  REC: 0.9561567164179104\n",
      "KNN  F: 0.9475387104229258\n",
      "1\n",
      "KNN  ACC: 0.953\n",
      "KNN  REC: 0.9208333333333333\n",
      "KNN  F: 0.9038854805725972\n",
      "2\n",
      "KNN  ACC: 0.9513333333333334\n",
      "KNN  REC: 0.9151343705799151\n",
      "KNN  F: 0.898611111111111\n",
      "0\n",
      "GaussianNB  ACC: 0.7968663184798311\n",
      "GaussianNB  REC: 0.7252798507462687\n",
      "GaussianNB  F: 0.6298096395301741\n",
      "1\n",
      "GaussianNB  ACC: 0.8116666666666666\n",
      "GaussianNB  REC: 0.7486111111111111\n",
      "GaussianNB  F: 0.6561168594035302\n",
      "2\n",
      "GaussianNB  ACC: 0.8013333333333333\n",
      "GaussianNB  REC: 0.743988684582744\n",
      "GaussianNB  F: 0.6383495145631067\n",
      "0\n",
      "BernoulliNB  ACC: 0.8386487387487499\n",
      "BernoulliNB  REC: 0.4608208955223881\n",
      "BernoulliNB  F: 0.5764294049008168\n",
      "1\n",
      "BernoulliNB  ACC: 0.846\n",
      "BernoulliNB  REC: 0.47638888888888886\n",
      "BernoulliNB  F: 0.5975609756097561\n",
      "2\n",
      "BernoulliNB  ACC: 0.8463333333333334\n",
      "BernoulliNB  REC: 0.4837340876944837\n",
      "BernoulliNB  F: 0.5973799126637555\n",
      "0\n",
      "DecisionTree  ACC: 1.0\n",
      "DecisionTree  REC: 1.0\n",
      "DecisionTree  F: 1.0\n",
      "1\n",
      "DecisionTree  ACC: 0.975\n",
      "DecisionTree  REC: 0.9611111111111111\n",
      "DecisionTree  F: 0.9485949280328992\n",
      "2\n",
      "DecisionTree  ACC: 0.9736666666666667\n",
      "DecisionTree  REC: 0.958981612446959\n",
      "DecisionTree  F: 0.9449477351916377\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/f/anaconda3/lib/python3.7/site-packages/sklearn/svm/base.py:196: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.\n",
      "  \"avoid this warning.\", FutureWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n",
      "SVM  ACC: 0.9523280364484943\n",
      "SVM  REC: 0.902518656716418\n",
      "SVM  F: 0.9002093510118633\n",
      "1\n",
      "SVM  ACC: 0.9513333333333334\n",
      "SVM  REC: 0.9\n",
      "SVM  F: 0.8987517337031901\n",
      "2\n",
      "SVM  ACC: 0.9586666666666667\n",
      "SVM  REC: 0.9137199434229137\n",
      "SVM  F: 0.9124293785310734\n",
      "0\n",
      "RandomForest  ACC: 0.9978886542949217\n",
      "RandomForest  REC: 0.9916044776119403\n",
      "RandomForest  F: 0.9955513931163662\n",
      "1\n",
      "RandomForest  ACC: 0.988\n",
      "RandomForest  REC: 0.9583333333333334\n",
      "RandomForest  F: 0.9745762711864409\n",
      "2\n",
      "RandomForest  ACC: 0.9853333333333333\n",
      "RandomForest  REC: 0.9462517680339463\n",
      "RandomForest  F: 0.9681620839363243\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/f/anaconda3/lib/python3.7/site-packages/sklearn/ensemble/forest.py:246: FutureWarning: The default value of n_estimators will change from 10 in version 0.20 to 100 in 0.22.\n",
      "  \"10 in version 0.20 to 100 in 0.22.\", FutureWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n",
      "AdaBoost  ACC: 0.962884764973886\n",
      "AdaBoost  REC: 0.9183768656716418\n",
      "AdaBoost  F: 0.9218164794007491\n",
      "1\n",
      "AdaBoost  ACC: 0.9633333333333334\n",
      "AdaBoost  REC: 0.9083333333333333\n",
      "AdaBoost  F: 0.922425952045134\n",
      "2\n",
      "AdaBoost  ACC: 0.9596666666666667\n",
      "AdaBoost  REC: 0.9151343705799151\n",
      "AdaBoost  F: 0.914487632508834\n",
      "0\n",
      "LogisticRegression  ACC: 0.7918657628625403\n",
      "LogisticRegression  REC: 0.3516791044776119\n",
      "LogisticRegression  F: 0.44602188701567574\n",
      "1\n",
      "LogisticRegression  ACC: 0.7886666666666666\n",
      "LogisticRegression  REC: 0.3527777777777778\n",
      "LogisticRegression  F: 0.44483362521891423\n",
      "2\n",
      "LogisticRegression  ACC: 0.7826666666666666\n",
      "LogisticRegression  REC: 0.3437057991513437\n",
      "LogisticRegression  F: 0.42706502636203864\n"
     ]
    }
   ],
   "source": [
    "def hr_modeling(features, label):\n",
    "    from sklearn.model_selection import train_test_split\n",
    "    f_v = features.values\n",
    "    l_v = label.values\n",
    "    # 训练集，验证集，Y为标注\n",
    "    X_tt, X_validation, Y_tt, Y_validation = train_test_split(f_v, l_v, test_size=0.2)\n",
    "    # 从训练集再切割0.25为测试集\n",
    "    X_train, X_test, Y_train, Y_test = train_test_split(X_tt, Y_tt, test_size=0.25)\n",
    "    from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier # KNN\n",
    "    from sklearn.naive_bayes import GaussianNB, BernoulliNB # 高斯，泊努力  朴素贝叶斯(对离散值比较适合，这里也试试)\n",
    "    from sklearn.tree import DecisionTreeClassifier # 决策树、\n",
    "    from sklearn.svm import SVC # 支持向量机\n",
    "    from sklearn.ensemble import RandomForestClassifier # 随机森林\n",
    "    from sklearn.ensemble import AdaBoostClassifier # 集成\n",
    "    from sklearn.linear_model import LogisticRegression # 逻辑斯特回归(这里数据线性不可分，效果也不会理想，适合线性可分情况)\n",
    "    models = []\n",
    "    models.append(('KNN', KNeighborsClassifier(n_neighbors=3)))\n",
    "    models.append(('GaussianNB', GaussianNB()))\n",
    "    models.append(('BernoulliNB', BernoulliNB()))\n",
    "    models.append(('DecisionTree', DecisionTreeClassifier()))\n",
    "    models.append(('SVM', SVC(C=100))) # 惩罚度，数值越大，运算越谨慎，时间越长\n",
    "    models.append(('RandomForest', RandomForestClassifier()))\n",
    "    models.append(('AdaBoost', AdaBoostClassifier()))\n",
    "    models.append(('LogisticRegression', LogisticRegression(C=1000, solver='sag')))\n",
    "    for clf_name, clf in models:\n",
    "        clf.fit(X_train, Y_train)\n",
    "        xy_lst = [(X_train, Y_train), (X_validation, Y_validation), (X_test, Y_test)]\n",
    "        for i in range(len(xy_lst)):\n",
    "            X_part = xy_lst[i][0]\n",
    "            Y_part = xy_lst[i][1]\n",
    "            Y_predict = clf.predict(X_part)\n",
    "            print(i)\n",
    "            print(clf_name, ' ACC:',accuracy_score(Y_part, Y_predict))\n",
    "            print(clf_name, ' REC:', recall_score(Y_part, Y_predict))\n",
    "            print(clf_name, ' F:',f1_score(Y_part, Y_predict))\n",
    "\n",
    "features, label = hr_preprocessing()\n",
    "hr_modeling(features, label) "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "从结果看出，这样的数据情况下，随机森林和决策树是表现比较好的，然后是KNN。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "剩下的就是对选定模型调惨优化了"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
